import polars as pl
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skewnorm, norm
import warnings
warnings.simplefilter(action='ignore')
from neuralforecast import NeuralForecast
from neuralforecast.models import NBEATSx, NHITS
#from neuralforecast.losses import MAE
from neuralprophet import NeuralProphet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, r2_score, f1_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
Inflation(GDP Deflator): Inflation is defined as a prolonged increase in the general price level of goods and services that erodes money's purchasing power. This might be caused by excessive aggregate demand, increased production costs (cost-push inflation), or central bank policy. Inflation can erode purchasing power, cause economic instability, and amplify income inequality. Central banks combat inflation by raising interest rates, selling government bonds, and increasing reserve requirements. Understanding inflation is critical for making informed decisions about saving and investing.
Unemployment rate: The unemployment rate is defined as the percentage of the labor force that is actively looking for work but is unable to find it, representing the state of the economy and job availability.The unemployment rate has a substantial impact on financial markets such as stocks, gold, and Bitcoin.High unemployment can lead to lower stock prices as consumer spending and corporate profits decline. Gold often shines in such times due to its safe-haven appeal, while Bitcoin can rise as an alternative investment amidst turmoil. Low unemployment, on the other hand, could prop up stock prices thanks to increased consumer spending and stronger corporate profits. Gold might lose its luster as economic strength reduces the need for safe havens, while Bitcoin's fate could be mixed based on investor risk appetite and its adoption potential.
GDP growth rate: The gross domestic product (GDP) growth rate, which measures the percentage change in the total market value of goods and services produced within a country over a specific period, serves as a crucial indicator of economic health and activity. Its influence on the financial market is significant. Increased economic activity leads to better business earnings, which drives up stock prices and boosts investor confidence. High-growth periods can raise risk appetite, thereby undermining gold's appeal as a haven asset. The link between GDP growth and cryptocurrency prices is complicated and changing.
GDP: Gross domestic product (GDP) represents the total monetary value of all final goods and services produced within a country during a specific period. This critical economic indicator influences a wide range of financial markets, including stocks, gold, and even Bitcoin. Economic growth fuels stock prices as corporate profits rise, while stagnant economies can lead to downturns. Gold is a safe investment during times of uncertainty, but its demand may wane during strong economic periods. Bitcoin's relationship with GDP is complex and requires further investigation. Understanding these dynamics empowers informed decision-making in the financial market.
Interest rate: Interest rates, or the cost of borrowing money, are a significant influence in the financial landscape, having enormous effects across a wide range of asset classes. Interest rates influence investment decisions, economic activity, and, ultimately, the functioning of financial markets by changing the cost of lending. Lower interest rates encourage borrowing and investment, potentially leading to higher business valuations and higher stock prices. Higher rates, on the other hand, discourage borrowing and investment, putting downward pressure on stock prices and necessitating a reconsideration of business values. Gold is sometimes used as an edge against inflation, with higher interest rates potentially benefiting from inflationary periods. Higher interest rates, on the other hand, promote investment in other asset types, thereby undermining gold's appeal.
CPI: The Consumer Price Index (CPI) tracks price fluctuations in a basket of products and services commonly purchased by urban consumers. This critical indicator serves as an inflation measure, influencing numerous areas of the financial market. Rising CPI means increased inflation, which may reduce company profits and discourage investment, potentially resulting in lower stock prices. Gold has traditionally been used to limit inflation, typically profiting from periods of rising CPI because its price rises in accordance with inflation.
Closing Price: The actual dataset had the opening, closing, low and high prices for each record. However we decided to move forward with the closing price instead. The closing price is the value of the last transacted price before the market officially closes for trading. However for Bitcoin, closing price generally refers to the price at 11:59 PM UTC of any given day.
df = pl.read_csv("SnP500.csv")
df.describe()
| statistic | Date | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI |
|---|---|---|---|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | "7555" | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 | 7555.0 |
| "null_count" | "0" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | null | 1616.111603 | 2.6369e9 | 2.15176 | 5.752548 | 2.463843 | 1.4420e13 | 3.317614 | 2.489871 |
| "std" | null | 977.093422 | 1.7951e9 | 1.168099 | 1.627849 | 1.824953 | 5.0062e12 | 2.132803 | 1.441926 |
| "min" | "1/10/1993" | 429.049988 | 1.499e7 | 0.640955 | 3.65 | -2.767803 | 6.8600e12 | -1.189357 | -0.355546 |
| "25%" | null | 1029.030029 | 9.81e8 | 1.558531 | 4.62 | 1.841875 | 1.0300e13 | 2.023885 | 1.622223 |
| "50%" | null | 1301.349976 | 2.8136e9 | 1.89961 | 5.45 | 2.70637 | 1.4500e13 | 2.960506 | 2.33769 |
| "75%" | null | 2050.629883 | 3.8921e9 | 2.37034 | 6.17 | 3.772565 | 1.8200e13 | 4.89831 | 2.951657 |
| "max" | "9/9/2022" | 4796.560059 | 1.1456e10 | 7.005276 | 9.63 | 5.945485 | 2.5500e13 | 7.148178 | 8.0028 |
df['Date']
| Date |
|---|
| str |
| "4/1/1993" |
| "5/1/1993" |
| "6/1/1993" |
| "7/1/1993" |
| "8/1/1993" |
| … |
| "23-12-2022" |
| "27-12-2022" |
| "28-12-2022" |
| "29-12-2022" |
| "30-12-2022" |
# Function to normalize date formats
def normalize_date(date_str):
for fmt in ("%d-%m-%Y", "%d/%m/%Y"):
try:
return datetime.strptime(date_str, fmt).strftime("%d-%m-%Y")
except ValueError:
continue
raise ValueError(f"Date format for {date_str} not recognized")
# Apply the normalization function to the Date column
df1 = df.with_columns(
pl.col("Date").apply(normalize_date)
)
df1
| Date | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI |
|---|---|---|---|---|---|---|---|---|
| str | f64 | i64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "04-01-1993" | 435.380005 | 201210000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "05-01-1993" | 434.339996 | 240350000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "06-01-1993" | 434.519989 | 295240000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "07-01-1993" | 430.730011 | 304850000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| "08-01-1993" | 429.049988 | 263470000 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| … | … | … | … | … | … | … | … | … |
| "23-12-2022" | 3844.820068 | 2819280000 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| "27-12-2022" | 3829.25 | 3030300000 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| "28-12-2022" | 3783.219971 | 3083520000 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| "29-12-2022" | 3849.280029 | 3003680000 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| "30-12-2022" | 3839.5 | 2979870000 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
df1["Date"]
| Date |
|---|
| str |
| "04-01-1993" |
| "05-01-1993" |
| "06-01-1993" |
| "07-01-1993" |
| "08-01-1993" |
| … |
| "23-12-2022" |
| "27-12-2022" |
| "28-12-2022" |
| "29-12-2022" |
| "30-12-2022" |
# Convert the 'Date' column to datetime
df1 = df1.with_columns(
pl.col("Date").str.strptime(pl.Date))
df1["Date"]
| Date |
|---|
| date |
| 1993-01-04 |
| 1993-01-05 |
| 1993-01-06 |
| 1993-01-07 |
| 1993-01-08 |
| … |
| 2022-12-23 |
| 2022-12-27 |
| 2022-12-28 |
| 2022-12-29 |
| 2022-12-30 |
# Extract year and quarter from the date
df1 = df1.with_columns([
pl.col("Date").dt.year().alias("Year"),
pl.col("Date").dt.quarter().alias("Quarter")
])
# Group by year and quarter and calculate the mean for each group
quarterly_df = df1.groupby(["Year", "Quarter"]).mean()
# Drop the Date column and rename the columns appropriately
quarterly_df = quarterly_df.drop("Date").rename({"Year": "Year", "Quarter": "Quarter"}).sort(["Year", "Quarter"])
quarterly_df
| Year | Quarter | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI |
|---|---|---|---|---|---|---|---|---|---|
| i32 | i8 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 1993 | 1 | 442.750321 | 2.6597e8 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| 1993 | 2 | 445.505872 | 2.6200e8 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| 1993 | 3 | 453.558748 | 2.5574e8 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| 1993 | 4 | 464.271874 | 2.75129375e8 | 2.37034 | 6.9 | 2.751781 | 6.8600e12 | 3.545617 | 2.951657 |
| 1994 | 1 | 469.213492 | 3.1279e8 | 2.135424 | 6.12 | 4.028793 | 7.2900e12 | 4.89831 | 2.607442 |
| … | … | … | … | … | … | … | … | … | … |
| 2021 | 4 | 4602.108894 | 4.0824e9 | 4.492792 | 5.35 | 5.945485 | 2.3300e13 | -1.189357 | 4.697859 |
| 2022 | 1 | 4463.855477 | 5.0287e9 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| 2022 | 2 | 4105.667102 | 4.9249e9 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| 2022 | 3 | 3980.351112 | 4.1903e9 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
| 2022 | 4 | 3851.973501 | 4.3452e9 | 7.005276 | 3.65 | 2.061593 | 2.5500e13 | 0.0 | 8.0028 |
#quarterly_df.write_csv("Quarter.csv")
#quarterly_df = pd.DataFrame(quarterly_df)
#quarterly_df
quarterly_df.columns
['Year', 'Quarter', 'Close', 'Volume', 'Inflation', 'Unemployment', 'GDP_Growth_Rate', 'GDP', 'Interest_rate', 'CPI']
# Convert to Pandas DataFrame
quarterly_df_pandas = quarterly_df.to_pandas()
quarterly_df_pandas
| Year | Quarter | Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1993 | 1 | 442.750321 | 2.659718e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 1 | 1993 | 2 | 445.505872 | 2.620033e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 2 | 1993 | 3 | 453.558748 | 2.557414e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 3 | 1993 | 4 | 464.271874 | 2.751294e+08 | 2.370340 | 6.90 | 2.751781 | 6.860000e+12 | 3.545617 | 2.951657 |
| 4 | 1994 | 1 | 469.213492 | 3.127857e+08 | 2.135424 | 6.12 | 4.028793 | 7.290000e+12 | 4.898310 | 2.607442 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 115 | 2021 | 4 | 4602.108894 | 4.082385e+09 | 4.492792 | 5.35 | 5.945485 | 2.330000e+13 | -1.189357 | 4.697859 |
| 116 | 2022 | 1 | 4463.855477 | 5.028659e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 117 | 2022 | 2 | 4105.667102 | 4.924918e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 118 | 2022 | 3 | 3980.351112 | 4.190339e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
| 119 | 2022 | 4 | 3851.973501 | 4.345159e+09 | 7.005276 | 3.65 | 2.061593 | 2.550000e+13 | 0.000000 | 8.002800 |
120 rows × 10 columns
# Plot 1: Line plot of Close prices over time
fig1 = px.line(quarterly_df_pandas, x='Year', y='Close', color='Quarter', title='Close Prices Over Time')
fig1.show()
# Plot 2: Line plot of GDP over time
fig2 = px.line(quarterly_df_pandas, x='Year', y='GDP', color='Quarter', title='GDP Over Time')
fig2.show()
# Plot 3: Line plot of Inflation over time
fig3 = px.line(quarterly_df_pandas, x='Year', y='Inflation', color='Quarter', title='Inflation Over Time')
fig3.show()
# Plot 4: Line plot of Unemployment over time
fig4 = px.line(quarterly_df_pandas, x='Year', y='Unemployment', color='Quarter', title='Unemployment Over Time')
fig4.show()
# Plot 5: Line plot of Interest Rate over time
fig5 = px.line(quarterly_df_pandas, x='Year', y='Interest_rate', color='Quarter', title='Interest Rate Over Time')
fig5.show()
# Plot 6: Line plot of CPI over time
fig6 = px.line(quarterly_df_pandas, x='Year', y='CPI', color='Quarter', title='CPI Over Time')
fig6.show()
# Plot 7: Line plot of GDP Growth Rate over time
fig7 = px.line(quarterly_df_pandas, x='Year', y='GDP_Growth_Rate', color='Quarter', title='GDP Growth Rate Over Time')
fig7.show()
# Plot 8: Line plot of Volume over time
fig8 = px.line(quarterly_df_pandas, x='Year', y='Volume', color='Quarter', title='Volume Over Time')
fig8.show()
#Dropping unnecessary features for further visualisation
quarterly_df_pd = quarterly_df_pandas.drop(columns=["Year","Quarter"])
#Distplot and Boxplot for each feature - Data distribution and Outlier Detection
plt.figure(figsize=[20,60])
columns = quarterly_df_pd.columns
cnt = 1
for col in columns:
plt.subplot(14, 2, cnt)
sns.distplot(quarterly_df_pd[col], fit=norm)
cnt += 1
plt.subplot(14, 2, cnt)
sns.boxplot(quarterly_df_pd[col])
cnt += 1
plt.tight_layout()
plt.show()
1) Features doesn't seem to align close with normal distribution except for CPI. 2) Close, Inflation, Unemployment, GDP Growth Rate and CPI are having outliers.
# Outlier Treatment - Values below the lower bound are replaced with the lower bound,
# and values above the upper bound are replaced with the upper bound.
# Function to detect and treat outliers using IQR method
def treat_outliers(df, column):
Q1 = df[column].quantile(0.25) # Q1 is the 25th percentile, and
Q3 = df[column].quantile(0.75) # Q3 is the 75th percentile of the data.
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
return df
# List of columns to treat for outliers
columns_to_treat = ['Close', 'Inflation', 'Unemployment', 'GDP_Growth_Rate', 'CPI']
# Apply outlier treatment to each column
for column in columns_to_treat:
quarterly_df_pd_treated = treat_outliers(quarterly_df_pd, column)
quarterly_df_pd_treated.describe()
| Close | Volume | Inflation | Unemployment | GDP_Growth_Rate | GDP | Interest_rate | CPI | |
|---|---|---|---|---|---|---|---|---|
| count | 120.000000 | 1.200000e+02 | 120.000000 | 120.000000 | 120.000000 | 1.200000e+02 | 120.000000 | 120.000000 |
| mean | 1570.962310 | 2.638766e+09 | 2.008767 | 5.673167 | 2.571960 | 1.442433e+13 | 3.316396 | 2.388689 |
| std | 864.223491 | 1.701057e+09 | 0.721140 | 1.471253 | 1.545983 | 5.025693e+12 | 2.140971 | 1.126728 |
| min | 442.750321 | 2.557414e+08 | 0.640955 | 3.650000 | -1.054160 | 6.860000e+12 | -1.189357 | -0.355546 |
| 25% | 1048.111877 | 1.061234e+09 | 1.558531 | 4.620000 | 1.841875 | 1.030000e+13 | 2.023885 | 1.622223 |
| 50% | 1295.423167 | 3.122430e+09 | 1.908772 | 5.400000 | 2.695293 | 1.450000e+13 | 2.776468 | 2.390137 |
| 75% | 2033.477972 | 3.887069e+09 | 2.370340 | 6.170000 | 3.772565 | 1.820000e+13 | 4.898310 | 2.951657 |
| max | 3511.527113 | 6.531582e+09 | 3.588052 | 8.495000 | 5.945485 | 2.550000e+13 | 7.148178 | 4.945808 |
#plt.figure(figsize=[60, 20])
#cnt = 1
#out_col = quarterly_df_pd_treated.columns
#for col in out_col:
# plt.subplot(4, 3, cnt)
# sns.boxplot(quarterly_df_pd_treated[col])
# cnt += 1
#plt.tight_layout()
#plt.show()
# Calculate the correlation matrix
correlation_matrix = quarterly_df_pd_treated.corr()
# Create a heatmap
fig_corr = go.Figure(data=go.Heatmap(
z=correlation_matrix.values,
x=correlation_matrix.columns,
y=correlation_matrix.columns,
colorscale='Viridis'))
fig_corr.update_layout(title='Correlation Heatmap')
fig_corr.show()
# Drop the target variable
independent_variables = quarterly_df_pd_treated.drop(columns=['Unemployment'])
# Calculate VIF for each independent variable
# A high VIF value (typically greater than 10) indicates that the variance
# of the coefficient estimate for that variable is inflated due to multicollinearity.
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(independent_variables.values, i) for i in range(independent_variables.shape[1])]
vif["features"] = independent_variables.columns
vif
| VIF Factor | features | |
|---|---|---|
| 0 | 8.460349 | Close |
| 1 | 8.136021 | Volume |
| 2 | 5.882536 | Inflation |
| 3 | 1.787589 | GDP_Growth_Rate |
| 4 | 158.537938 | GDP |
| 5 | 1.025189 | Interest_rate |
| 6 | 5.205215 | CPI |
# Prepare the data for time series forecasting
quarterly_df_pandas['ds'] = pd.to_datetime(quarterly_df_pandas['Year'].astype(str) + 'Q' + quarterly_df_pandas['Quarter'].astype(str))
quarterly_df_pandas = quarterly_df_pandas.sort_values(by='ds')
#quarterly_df_pandas.set_index('ds', inplace=True)
quarterly_df_pandas.columns
Index(['Year', 'Quarter', 'Close', 'Volume', 'Inflation', 'Unemployment',
'GDP_Growth_Rate', 'GDP', 'Interest_rate', 'CPI', 'ds'],
dtype='object')
# Select the relevant columns
ts_data = quarterly_df_pandas[['ds', 'Unemployment', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'Interest_rate', 'CPI']]
ts_data.columns = ['ds', 'y', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'Interest_rate', 'CPI']
ts_data['unique_id'] = 1 # Add a unique_id column for the NeuralForecast class
#Feature Scaling
scaler = MinMaxScaler()
columns_to_normalize = ['y', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'Interest_rate', 'CPI']
ts_data[columns_to_normalize] = scaler.fit_transform(ts_data[columns_to_normalize])
# Split the data into training and test sets
train_size = int(len(ts_data) * 0.8)
train_data = ts_data[:train_size]
test_data = ts_data[train_size:]
# Initialize the models
nbeatsx_model = NBEATSx(h=len(test_data), input_size= 2*len(test_data), max_steps=50)
nhits_model = NHITS(h=len(test_data), input_size= 2*len(test_data), max_steps=50)
# Initialize the NeuralForecast class
nf = NeuralForecast(models=[nbeatsx_model, nhits_model], freq='Q')
# Fit the models
nf.fit(train_data)
# Make predictions
predictions = nf.predict(test_data)
# Extract predictions for each model
nbeatsx_forecast = predictions[['ds', 'NBEATSx']]
nhits_forecast = predictions[['ds', 'NHITS']]
# Calculate performance metrics for NBEATSx
mae_nbeatsx = mean_absolute_error(test_data['y'], nbeatsx_forecast['NBEATSx'])
mse_nbeatsx = mean_squared_error(test_data['y'], nbeatsx_forecast['NBEATSx'])
rmse_nbeatsx = np.sqrt(mse_nbeatsx)
# Calculate performance metrics for NHITS
mae_nhits = mean_absolute_error(test_data['y'], nhits_forecast['NHITS'])
mse_nhits = mean_squared_error(test_data['y'], nhits_forecast['NHITS'])
rmse_nhits = np.sqrt(mse_nhits)
# Create a DataFrame to return the results in tabular format
results_df = pd.DataFrame({
'Models': ['NBEATSx', 'NHITS'],
'MAE': [mae_nbeatsx, mae_nhits],
'MSE': [mse_nbeatsx, mse_nhits],
'RMSE': [rmse_nbeatsx, rmse_nhits]
})
results_df
Seed set to 1 Seed set to 1
Sanity Checking: | | 0/? [00:00<…
Training: | | 0/? [00:00<…
Validation: | | 0/? [00:00<…
Sanity Checking: | | 0/? [00:00<…
Training: | | 0/? [00:00<…
Validation: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
| Models | MAE | MSE | RMSE | |
|---|---|---|---|---|
| 0 | NBEATSx | 0.181878 | 0.074991 | 0.273845 |
| 1 | NHITS | 0.210793 | 0.060834 | 0.246646 |
Given that MSE and RMSE are often considered more important metrics because they penalize larger errors more heavily, NHITS can be considered the better model overall in this case.
train_data.set_index('ds', inplace=True)
test_data.set_index('ds', inplace=True)
predictions.set_index('ds', inplace=True)
# Plot the results2
plt.figure(figsize=(10, 6))
plt.plot(train_data['y'], label='Training Data')
plt.plot(test_data['y'], label='Test Data')
plt.plot(predictions['NHITS'], label='Predictions', color='red')
plt.legend()
plt.title('Quarterly Unemployment Forecast')
plt.xlabel('Date')
plt.ylabel('Unemployment')
plt.show()
predictions
| NBEATSx | NHITS | |
|---|---|---|
| ds | ||
| 2022-12-31 | 0.028557 | 0.020406 |
| 2023-03-31 | 0.007411 | -0.003748 |
| 2023-06-30 | 0.014606 | 0.026440 |
| 2023-09-30 | -0.024062 | 0.040145 |
| 2023-12-31 | 0.029705 | 0.055193 |
| 2024-03-31 | -0.037909 | 0.073615 |
| 2024-06-30 | -0.075761 | 0.138846 |
| 2024-09-30 | -0.040313 | 0.174221 |
| 2024-12-31 | -0.016863 | 0.179870 |
| 2025-03-31 | -0.012247 | 0.191009 |
| 2025-06-30 | -0.040432 | 0.217390 |
| 2025-09-30 | 0.037842 | 0.264717 |
| 2025-12-31 | -0.009255 | 0.320639 |
| 2026-03-31 | 0.143441 | 0.371401 |
| 2026-06-30 | 0.189561 | 0.419133 |
| 2026-09-30 | 0.181166 | 0.441882 |
| 2026-12-31 | 0.193167 | 0.430465 |
| 2027-03-31 | 0.228466 | 0.414037 |
| 2027-06-30 | 0.285549 | 0.420930 |
| 2027-09-30 | 0.208643 | 0.443728 |
| 2027-12-31 | 0.173339 | 0.454692 |
| 2028-03-31 | 0.260231 | 0.401967 |
| 2028-06-30 | 0.252845 | 0.365177 |
| 2028-09-30 | 0.162625 | 0.369130 |
# insample_prediction
Y_hat_insample = nf.predict_insample(step_size=len(test_data))
plt.figure(figsize=(10, 5))
plt.plot(Y_hat_insample['ds'], Y_hat_insample['y'], label='True')
plt.plot(Y_hat_insample['ds'], Y_hat_insample['NHITS'], label='Forecast')
plt.axvline(Y_hat_insample['ds'].iloc[-12], color='black', linestyle='--', label='Train-Test Split')
plt.xlabel('Timestamp [t]')
plt.ylabel('')
plt.grid()
plt.legend()
Predicting: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
<matplotlib.legend.Legend at 0x224d46fdfd0>
# Select the relevant columns
ts_data1 = quarterly_df_pandas[['ds', 'Unemployment', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'GDP', 'Interest_rate', 'CPI']]
ts_data1.columns = ['ds', 'y', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'GDP', 'Interest_rate', 'CPI']
ts_data1['unique_id'] = 2 # Add a unique_id column for the NeuralForecast class
#Feature Scaling
scaler = MinMaxScaler()
columns_to_normalize1 = ['y', 'Close', 'Volume', 'Inflation', 'GDP_Growth_Rate', 'GDP', 'Interest_rate', 'CPI']
ts_data1[columns_to_normalize1] = scaler.fit_transform(ts_data1[columns_to_normalize1])
# Split the data into training and test sets
train_size1 = int(len(ts_data1) * 0.8)
train_data1 = ts_data1[:train_size]
test_data1 = ts_data1[train_size:]
# Initialize the models
nbeatsx_model1 = NBEATSx(h=len(test_data1), input_size= 2*len(test_data1), max_steps=50)
nhits_model1 = NHITS(h=len(test_data1), input_size= 2*len(test_data1), max_steps=50)
# Initialize the NeuralForecast class
nf1 = NeuralForecast(models=[nbeatsx_model1, nhits_model1], freq='Q')
# Fit the models
nf1.fit(train_data1)
# Make predictions
predictions1 = nf1.predict(test_data1)
# Extract predictions for each model
nbeatsx_forecast1 = predictions1[['ds', 'NBEATSx']]
nhits_forecast1 = predictions1[['ds', 'NHITS']]
# Calculate performance metrics for NBEATSx
mae_nbeatsx1 = mean_absolute_error(test_data1['y'], nbeatsx_forecast1['NBEATSx'])
mse_nbeatsx1 = mean_squared_error(test_data1['y'], nbeatsx_forecast1['NBEATSx'])
rmse_nbeatsx1 = np.sqrt(mse_nbeatsx1)
# Calculate performance metrics for NHITS
mae_nhits1 = mean_absolute_error(test_data1['y'], nhits_forecast1['NHITS'])
mse_nhits1 = mean_squared_error(test_data1['y'], nhits_forecast1['NHITS'])
rmse_nhits1 = np.sqrt(mse_nhits1)
# Create a DataFrame to return the results in tabular format
results_df1 = pd.DataFrame({
'Models': ['NBEATSx', 'NHITS'],
'MAE': [mae_nbeatsx1, mae_nhits1],
'MSE': [mse_nbeatsx1, mse_nhits1],
'RMSE': [rmse_nbeatsx1, rmse_nhits1]
})
results_df1
Seed set to 1 Seed set to 1
Sanity Checking: | | 0/? [00:00<…
Training: | | 0/? [00:00<…
Validation: | | 0/? [00:00<…
Sanity Checking: | | 0/? [00:00<…
Training: | | 0/? [00:00<…
Validation: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
| Models | MAE | MSE | RMSE | |
|---|---|---|---|---|
| 0 | NBEATSx | 0.181878 | 0.074991 | 0.273845 |
| 1 | NHITS | 0.210793 | 0.060834 | 0.246646 |
train_data1.set_index('ds', inplace=True)
test_data1.set_index('ds', inplace=True)
predictions1.set_index('ds', inplace=True)
# Plot the results2
plt.figure(figsize=(10, 6))
plt.plot(train_data1['y'], label='Training Data')
plt.plot(test_data1['y'], label='Test Data')
plt.plot(predictions1['NHITS'], label='Predictions', color='red')
plt.legend()
plt.title('Quarterly Unemployment Forecast')
plt.xlabel('Date')
plt.ylabel('Unemployment')
plt.show()
predictions1
| NBEATSx | NHITS | |
|---|---|---|
| ds | ||
| 2022-12-31 | 0.028557 | 0.020406 |
| 2023-03-31 | 0.007411 | -0.003748 |
| 2023-06-30 | 0.014606 | 0.026440 |
| 2023-09-30 | -0.024062 | 0.040145 |
| 2023-12-31 | 0.029705 | 0.055193 |
| 2024-03-31 | -0.037909 | 0.073615 |
| 2024-06-30 | -0.075761 | 0.138846 |
| 2024-09-30 | -0.040313 | 0.174221 |
| 2024-12-31 | -0.016863 | 0.179870 |
| 2025-03-31 | -0.012247 | 0.191009 |
| 2025-06-30 | -0.040432 | 0.217390 |
| 2025-09-30 | 0.037842 | 0.264717 |
| 2025-12-31 | -0.009255 | 0.320639 |
| 2026-03-31 | 0.143441 | 0.371401 |
| 2026-06-30 | 0.189561 | 0.419133 |
| 2026-09-30 | 0.181166 | 0.441882 |
| 2026-12-31 | 0.193167 | 0.430465 |
| 2027-03-31 | 0.228466 | 0.414037 |
| 2027-06-30 | 0.285549 | 0.420930 |
| 2027-09-30 | 0.208643 | 0.443728 |
| 2027-12-31 | 0.173339 | 0.454692 |
| 2028-03-31 | 0.260231 | 0.401967 |
| 2028-06-30 | 0.252845 | 0.365177 |
| 2028-09-30 | 0.162625 | 0.369130 |
# insample_prediction
Y_hat_insample1 = nf1.predict_insample(step_size=len(test_data1))
plt.figure(figsize=(10, 5))
plt.plot(Y_hat_insample1['ds'], Y_hat_insample1['y'], label='True')
plt.plot(Y_hat_insample1['ds'], Y_hat_insample1['NHITS'], label='Forecast')
plt.axvline(Y_hat_insample1['ds'].iloc[-12], color='black', linestyle='--', label='Train-Test Split')
plt.xlabel('Timestamp [t]')
plt.ylabel('')
plt.grid()
plt.legend()
Predicting: | | 0/? [00:00<…
Predicting: | | 0/? [00:00<…
<matplotlib.legend.Legend at 0x224d48c6ac0>